get_upper_tri = function(cormat){
cormat[lower.tri(cormat)] = NA
diag(cormat) = NA
return(cormat)
}
hard_thresh = function(R, th){
R_th = R
R_th[abs(R) <= th] = 0
return(R_th)
}
# The Nomic data are not publicly available
# Metadata
meta_data = read_csv("../data/nomic/nomic_meta.csv")
# Taxonomy
tax = read_tsv("../data/nomic/nomic_taxnomy.txt") %>%
arrange(otu_id)
otu_id = tax$otu_id
tax = data.frame(tax[, -1]) %>%
rowwise() %>%
dplyr::mutate_all(function(x) strsplit(x, "__")[[1]][2]) %>%
mutate(
species = ifelse(!is.na(species) & !is.na(genus),
paste(strsplit(genus, "")[[1]][1], species, sep = "."),
NA)) %>%
ungroup()
tax = as.matrix(tax)
rownames(tax) = otu_id
# Day 30
# OTU table
otu_table = read_tsv("../data/nomic/nomic_otu_30.txt")
sample_id = otu_table$`#SampleID`
otu_id = colnames(otu_table)[-1]
otu_table = t(otu_table[, -1])
otu_table = data.frame(otu_table)
rownames(otu_table) = otu_id
colnames(otu_table) = sample_id
otu_table = as.matrix(otu_table)
# OTU object
OTU = otu_table(otu_table, taxa_are_rows = TRUE)
META = sample_data(meta_data)
sample_names(META) = meta_data$`#SampleID`
TAX = tax_table(tax)
otu_data30 = phyloseq(OTU, TAX, META)
# Aggregate to phylum level
phylum_data30 = aggregate_taxa(otu_data30, "phylum")
phylum_data30 = subset_taxa(phylum_data30, phylum != "Unknown")
# Aggregate to family level
family_data30 = aggregate_taxa(otu_data30, "family")
family_data30 = subset_taxa(family_data30, family != "Unknown")
# Day 120
# OTU table
otu_table = read_tsv("../data/nomic/nomic_otu_120.txt")
sample_id = otu_table$`#SampleID`
otu_id = colnames(otu_table)[-1]
otu_table = t(otu_table[, -1])
otu_table = data.frame(otu_table)
rownames(otu_table) = otu_id
colnames(otu_table) = sample_id
otu_table = as.matrix(otu_table)
# OTU object
OTU = otu_table(otu_table, taxa_are_rows = TRUE)
META = sample_data(meta_data)
sample_names(META) = meta_data$`#SampleID`
TAX = tax_table(tax)
otu_data120 = phyloseq(OTU, TAX, META)
# Aggregate to phylum level
phylum_data120 = aggregate_taxa(otu_data120, "phylum")
phylum_data120 = subset_taxa(phylum_data120, phylum != "Unknown")
# Aggregate to family level
family_data120 = aggregate_taxa(otu_data120, "family")
family_data120 = subset_taxa(family_data120, family != "Unknown")
# Day 365
# OTU table
otu_table = read_tsv("../data/nomic/nomic_otu_365.txt")
sample_id = otu_table$`#SampleID`
otu_id = colnames(otu_table)[-1]
otu_table = t(otu_table[, -1])
otu_table = data.frame(otu_table)
rownames(otu_table) = otu_id
colnames(otu_table) = sample_id
otu_table = as.matrix(otu_table)
# OTU object
OTU = otu_table(otu_table, taxa_are_rows = TRUE)
META = sample_data(meta_data)
sample_names(META) = meta_data$`#SampleID`
TAX = tax_table(tax)
otu_data365 = phyloseq(OTU, TAX, META)
# Aggregate to phylum level
phylum_data365 = aggregate_taxa(otu_data365, "phylum")
phylum_data365 = subset_taxa(phylum_data365, phylum != "Unknown")
# Aggregate to family level
family_data365 = aggregate_taxa(otu_data365, "family")
family_data365 = subset_taxa(family_data365, family != "Unknown")
# Sample size
df_sample = data.frame(day30 = nsamples(otu_data30),
day120 = nsamples(otu_data120),
day365 = nsamples(otu_data365))
# Find the most abundant taxa across time
top_phylum30 = top_taxa(phylum_data30, n = 50)
top_phylum120 = top_taxa(phylum_data120, n = 50)
top_phylum365 = top_taxa(phylum_data365, n = 50)
top_family30 = top_taxa(family_data30, n = 50)
top_family120 = top_taxa(family_data120, n = 50)
top_family365 = top_taxa(family_data365, n = 50)
common_phylum = Reduce(intersect, list(day30 = top_phylum30,
day120 = top_phylum120,
day365 = top_phylum365))
common_phylum = common_phylum[!is.na(common_phylum)]
top_phylum = sort(common_phylum[1:10])
common_family = Reduce(intersect, list(day30 = top_family30,
day120 = top_family120,
day365 = top_family365))
common_family = common_family[!is.na(common_family)]
top_family = sort(common_family[1:10])
# top_phylum = c("Acidobacteria", "Actinobacteria", "Bacteroidetes", "Cyanobacteria", "Firmicutes",
# "Fusobacteria", "Proteobacteria", "Tenericutes", "TM7", "Verrucomicrobia")
# top_family = c("Bacteroidaceae", "Bifidobacteriaceae", "Clostridiaceae", "Enterobacteriaceae", "Fusobacteriaceae",
# "Lachnospiraceae", "Moraxellaceae", "Pasteurellaceae", "Ruminococcaceae", "Staphylococcaceae")
# Prevalence
# Phylum
qc_phylum = data.frame(matrix(NA, nrow = 10, ncol = 4))
colnames(qc_phylum) = c("phylum", "day30", "day120", "day365")
core_phylum30 = subset_taxa(phylum_data30, phylum %in% top_phylum)
core_phylum120 = subset_taxa(phylum_data120, phylum %in% top_phylum)
core_phylum365 = subset_taxa(phylum_data365, phylum %in% top_phylum)
qc_phylum$phylum = taxa_names(core_phylum30)
qc_phylum$day30 = apply(abundances(core_phylum30), 1, function(x) sum(x != 0)/nsamples(core_phylum30))
qc_phylum$day120 = apply(abundances(core_phylum120), 1, function(x) sum(x != 0)/nsamples(core_phylum120))
qc_phylum$day365 = apply(abundances(core_phylum365), 1, function(x) sum(x != 0)/nsamples(core_phylum365))
# Family
qc_family = data.frame(matrix(NA, nrow = 10, ncol = 4))
colnames(qc_family) = c("family", "day30", "day120", "day365")
core_family30 = subset_taxa(family_data30, family %in% top_family)
core_family120 = subset_taxa(family_data120, family %in% top_family)
core_family365 = subset_taxa(family_data365, family %in% top_family)
qc_family$family = taxa_names(core_family30)
qc_family$day30 = apply(abundances(core_family30), 1, function(x) sum(x != 0)/nsamples(core_family30))
qc_family$day120 = apply(abundances(core_family120), 1, function(x) sum(x != 0)/nsamples(core_family120))
qc_family$day365 = apply(abundances(core_family365), 1, function(x) sum(x != 0)/nsamples(core_family365))
# Outputs
saveRDS(df_sample, file = "../data/nomic/qc/df_sample.rds")
saveRDS(df_sample, file = "../data/nomic/qc/qc_phylum.rds")
saveRDS(df_sample, file = "../data/nomic/qc/qc_family.rds")
qc_sample = read_rds("../data/nomic/qc/df_sample.rds")
colnames(qc_sample) = c("Day 30", "Day 120", "Day 365")
datatable(qc_sample)
qc_phylum = read_rds("../data/nomic/qc/qc_phylum.rds") %>%
mutate_if(is.numeric, function(x) round(x, 2)) %>%
arrange(phylum)
colnames(qc_phylum) = c("Phylum", "Day 30", "Day 120", "Day 365")
datatable(qc_phylum)
qc_family = read_rds("../data/nomic/qc/qc_family.rds") %>%
mutate_if(is.numeric, function(x) round(x, 2)) %>%
arrange(family)
colnames(qc_family) = c("Family", "Day 30", "Day 120", "Day 365")
datatable(qc_family)
pseqs = list(data1 = c(otu_data30, core_phylum30),
data2 = c(otu_data120, core_phylum120),
data3 = c(otu_data365, core_phylum365))
pseudo = 0; prv_cut = 0.1; lib_cut = 0; corr_cut = 0.5
wins_quant = c(0.05, 0.95); method = "pearson"; soft = FALSE; thresh_len = 20
n_cv = 10; thresh_hard = 0; max_p_linear = 0.1; n_cl = 2
set.seed(123)
res_linear = secom_linear(pseqs, pseudo, prv_cut, lib_cut, corr_cut,
wins_quant, method, soft, thresh_len, n_cv,
thresh_hard, max_p_linear, n_cl)
R = 1000; max_p_dist = 0.1
set.seed(123)
res_dist = secom_dist(pseqs, pseudo, prv_cut, lib_cut, corr_cut,
wins_quant, R, thresh_hard, max_p_dist, n_cl)
saveRDS(res_linear, file = "../data/nomic/phylum/res_linear.rds")
saveRDS(res_dist, file = "../data/nomic/phylum/res_dist.rds")
# Day 30
# Bias
s_30 = res_linear$s_diff_hat$data1
# Raw abundances
raw_30 = abundances(phylum_data30)
raw_30 = raw_30[, names(s_30)]
# Corrected abundances
o = log(raw_30)
o[is.infinite(o)] = NA
d = nrow(o)
y_hat_30 = o - rowMeans(o, na.rm = TRUE) - t(replicate(d, s_30))
y_hat_30 = exp(y_hat_30)
# Day 120
# Bias
s_120 = res_linear$s_diff_hat$data2
# Raw abundances
raw_120 = abundances(phylum_data120)
raw_120 = raw_120[, names(s_120)]
# Corrected abundances
o = log(raw_120)
o[is.infinite(o)] = NA
d = nrow(o)
y_hat_120 = o - rowMeans(o, na.rm = TRUE) - t(replicate(d, s_120))
y_hat_120 = exp(y_hat_120)
# Day 365
# Bias
s_365 = res_linear$s_diff_hat$data3
# Raw abundances
raw_365 = abundances(phylum_data365)
raw_365 = raw_365[, names(s_365)]
# Corrected abundances
o = log(raw_365)
o[is.infinite(o)] = NA
d = nrow(o)
y_hat_365 = o - rowMeans(o, na.rm = TRUE) - t(replicate(d, s_365))
y_hat_365 = exp(y_hat_365)
phylum_keep = rownames(res_linear$y_hat)
phylum_keep_30 = phylum_keep[grepl("data1", phylum_keep)]
phylum_keep_30 = sort(unique(gsub("data1 - ", "", phylum_keep_30)))
idx = which(rownames(y_hat_30) %in% phylum_keep_30)
y_hat_30 = as.data.frame(y_hat_30[idx, ]) %>%
bind_rows(as.data.frame(t(data.frame(Others = colSums(y_hat_30[-idx, ], na.rm = TRUE)))))
phylum_keep_120 = phylum_keep[grepl("data2", phylum_keep)]
phylum_keep_120 = sort(unique(gsub("data2 - ", "", phylum_keep_120)))
idx = which(rownames(y_hat_120) %in% phylum_keep_120)
y_hat_120 = as.data.frame(y_hat_120[idx, ]) %>%
bind_rows(as.data.frame(t(data.frame(Others = colSums(y_hat_120[-idx, ], na.rm = TRUE)))))
phylum_keep_365 = phylum_keep[grepl("data3", phylum_keep)]
phylum_keep_365 = sort(unique(gsub("data3 - ", "", phylum_keep_365)))
idx = which(rownames(y_hat_365) %in% phylum_keep_365)
y_hat_365 = as.data.frame(y_hat_365[idx, ]) %>%
bind_rows(as.data.frame(t(data.frame(Others = colSums(y_hat_365[-idx, ], na.rm = TRUE)))))
y_hat = y_hat_30 %>%
rownames_to_column("phylum") %>%
mutate(day = "30") %>%
select(phylum, day, everything()) %>%
bind_rows(
y_hat_120 %>%
rownames_to_column("phylum") %>%
mutate(day = "120") %>%
select(phylum, day, everything())
) %>%
bind_rows(
y_hat_365 %>%
rownames_to_column("phylum") %>%
mutate(day = "365") %>%
select(phylum, day, everything())
)
saveRDS(y_hat, file = "../data/nomic/phylum/y_hat.rds")
res_linear = read_rds("../data/nomic/phylum/res_linear.rds")
res_dist = read_rds("../data/nomic/phylum/res_dist.rds")
# Data organization
# Linear relationships
corr_linear = res_linear$corr_fl
cooccur_linear = res_linear$mat_cooccur
# Filter by co-occurrence
overlap = 10
corr_linear[cooccur_linear < overlap] = 0
df_linear = data.frame(get_upper_tri(corr_linear)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(var2 = gsub("\\...", " - ", var2),
value = round(value, 2),
metric = "Pearson")
# Distance relationships
corr_dist = res_dist$dcorr_fl
cooccur_dist = res_dist$mat_cooccur
# Filter by co-occurrence
overlap = 10
corr_dist[cooccur_dist < overlap] = 0
df_dist = data.frame(get_upper_tri(corr_dist)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(var2 = gsub("\\...", " - ", var2),
value = round(value, 2),
metric = "Distance")
# Merge datasets
df_corr = df_linear %>%
bind_rows(
df_dist
)
phylum_level = sort(union(df_corr$var1, df_corr$var2))
phylum_label = sapply(phylum_level, function(x) strsplit(x, " - ")[[1]][2])
txt_color = case_when(grepl("data1", phylum_level) ~ "#1B9E77",
grepl("data2", phylum_level) ~ "#D95F02",
TRUE ~ "#7570B3")
df_corr$var1 = factor(df_corr$var1, levels = phylum_level)
df_corr$var2 = factor(df_corr$var2, levels = phylum_level)
df_corr$metric = factor(df_corr$metric, levels = c("Pearson", "Distance"))
p_corr = df_corr %>%
ggplot(aes(var2, var1, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "grey",
midpoint = 0, limit = c(-1,1), space = "Lab",
name = NULL) +
scale_x_discrete(drop = FALSE, labels = phylum_label) +
scale_y_discrete(drop = FALSE, labels = phylum_label) +
facet_grid(rows = vars(metric)) +
geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = NULL) +
guides(fill = guide_colorbar(barwidth = 1, barheight = 7,
title.position = "right")) +
theme_bw() +
geom_vline(xintercept = c(4.5, 9.5), color = "blue", linetype = "dashed") +
geom_hline(yintercept = c(4.5, 9.5), color = "blue", linetype = "dashed") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1,
face = "italic", color = txt_color),
axis.text.y = element_text(size = 12, face = "italic", color = txt_color),
strip.text.x = element_text(size = 14),
strip.text.y = element_text(size = 14),
legend.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 15),
panel.grid.major = element_blank(),
axis.ticks = element_blank()) +
coord_fixed()
p_corr
y_hat = read_rds("../data/nomic/phylum/y_hat.rds")
df_abn = y_hat %>%
mutate_if(is.numeric, ~replace_na(., 0)) %>%
mutate(ave = rowMeans(across(where(is.numeric)))) %>%
select(phylum, day, ave) %>%
filter(phylum != "Others")
df_abn$phylum = factor(df_abn$phylum)
df_abn$day = factor(df_abn$day, levels = c("30", "120", "365"))
p_abn = df_abn %>%
ggplot(aes(x = day, y = ave, fill = phylum)) +
geom_bar(position = "dodge", stat = "identity") +
labs(x = "Day", y = "Abundance") +
scale_fill_brewer(name = NULL, palette = "Set1", drop = FALSE) +
scale_x_discrete(drop = FALSE) +
facet_wrap(.~phylum, nrow = 2, scales = "free") +
theme_bw()
p_abn
y_hat = y_hat %>%
mutate_if(is.numeric, ~replace_na(., 0)) %>%
mutate(total = rowSums(across(where(is.numeric))))
df_rel = y_hat %>%
filter(day == "30") %>%
mutate(prop = total/sum(total)) %>%
select(phylum, day, prop) %>%
bind_rows(
y_hat %>%
filter(day == "120") %>%
mutate(prop = total/sum(total)) %>%
select(phylum, day, prop)
) %>%
bind_rows(
y_hat %>%
filter(day == "365") %>%
mutate(prop = total/sum(total)) %>%
select(phylum, day, prop)
)
df_rel$phylum = factor(df_rel$phylum)
df_rel$phylum = forcats::fct_relevel(df_rel$phylum, "Others", after = Inf)
df_rel$day = factor(df_rel$day, levels = c("30", "120", "365"))
p_rel = df_rel %>%
ggplot(aes(x = day, y = prop, fill = phylum)) +
geom_bar(position = "stack", stat = "identity") +
labs(x = "Day", y = "Relative Abundance") +
scale_fill_brewer(name = NULL, palette = "Set1") +
theme_bw()
p_rel
df_rel = df_rel %>%
mutate(prop = round(prop, 2)) %>%
pivot_wider(names_from = "day", values_from = "prop") %>%
arrange(phylum)
datatable(df_rel)
# Combine abundance and relative abundance plots
p_abn = p_abn +
theme(legend.position = "none") +
labs(x = NULL)
p_combine = ggarrange(p_abn, p_rel, nrow = 2,
heights = c(1, 1.5), labels = c("a", "b"))
ggsave(plot = p_combine, "../images/supp/supp_nomic_count_phylum.pdf",
height = 8, width = 10)
ggsave(plot = p_combine, "../images/supp/supp_nomic_count_phylum.jpeg",
height = 8, width = 10, dpi = 300)
pseqs = list(data1 = c(otu_data30, core_family30),
data2 = c(otu_data120, core_family120),
data3 = c(otu_data365, core_family365))
pseudo = 0; prv_cut = 0.1; lib_cut = 0; corr_cut = 0.5
wins_quant = c(0.05, 0.95); method = "pearson"; soft = FALSE; thresh_len = 20
n_cv = 10; thresh_hard = 0; max_p_linear = 0.1; n_cl = 2
set.seed(123)
res_linear = secom_linear(pseqs, pseudo, prv_cut, lib_cut, corr_cut,
wins_quant, method, soft, thresh_len, n_cv,
thresh_hard, max_p_linear, n_cl)
R = 1000; max_p_dist = 0.1
set.seed(123)
res_dist = secom_dist(pseqs, pseudo, prv_cut, lib_cut, corr_cut,
wins_quant, R, thresh_hard, max_p_dist, n_cl)
saveRDS(res_linear, file = "../data/nomic/family/res_linear.rds")
saveRDS(res_dist, file = "../data/nomic/family/res_dist.rds")
# Day 30
# Bias
s_30 = res_linear$s_diff_hat$data1
# Raw abundances
raw_30 = abundances(family_data30)
raw_30 = raw_30[, names(s_30)]
# Corrected abundances
o = log(raw_30)
o[is.infinite(o)] = NA
d = nrow(o)
y_hat_30 = o - rowMeans(o, na.rm = TRUE) - t(replicate(d, s_30))
y_hat_30 = exp(y_hat_30)
# Day 120
# Bias
s_120 = res_linear$s_diff_hat$data2
# Raw abundances
raw_120 = abundances(family_data120)
raw_120 = raw_120[, names(s_120)]
# Corrected abundances
o = log(raw_120)
o[is.infinite(o)] = NA
d = nrow(o)
y_hat_120 = o - rowMeans(o, na.rm = TRUE) - t(replicate(d, s_120))
y_hat_120 = exp(y_hat_120)
# Day 365
# Bias
s_365 = res_linear$s_diff_hat$data3
# Raw abundances
raw_365 = abundances(family_data365)
raw_365 = raw_365[, names(s_365)]
# Corrected abundances
o = log(raw_365)
o[is.infinite(o)] = NA
d = nrow(o)
y_hat_365 = o - rowMeans(o, na.rm = TRUE) - t(replicate(d, s_365))
y_hat_365 = exp(y_hat_365)
family_keep = rownames(res_linear$y_hat)
family_keep_30 = family_keep[grepl("data1", family_keep)]
family_keep_30 = sort(unique(gsub("data1 - ", "", family_keep_30)))
idx = which(rownames(y_hat_30) %in% family_keep_30)
y_hat_30 = as.data.frame(y_hat_30[idx, ]) %>%
bind_rows(as.data.frame(t(data.frame(Others = colSums(y_hat_30[-idx, ], na.rm = TRUE)))))
family_keep_120 = family_keep[grepl("data2", family_keep)]
family_keep_120 = sort(unique(gsub("data2 - ", "", family_keep_120)))
idx = which(rownames(y_hat_120) %in% family_keep_120)
y_hat_120 = as.data.frame(y_hat_120[idx, ]) %>%
bind_rows(as.data.frame(t(data.frame(Others = colSums(y_hat_120[-idx, ], na.rm = TRUE)))))
family_keep_365 = family_keep[grepl("data3", family_keep)]
family_keep_365 = sort(unique(gsub("data3 - ", "", family_keep_365)))
idx = which(rownames(y_hat_365) %in% family_keep_365)
y_hat_365 = as.data.frame(y_hat_365[idx, ]) %>%
bind_rows(as.data.frame(t(data.frame(Others = colSums(y_hat_365[-idx, ], na.rm = TRUE)))))
y_hat = y_hat_30 %>%
rownames_to_column("family") %>%
mutate(day = "30") %>%
select(family, day, everything()) %>%
bind_rows(
y_hat_120 %>%
rownames_to_column("family") %>%
mutate(day = "120") %>%
select(family, day, everything())
) %>%
bind_rows(
y_hat_365 %>%
rownames_to_column("family") %>%
mutate(day = "365") %>%
select(family, day, everything())
)
saveRDS(y_hat, file = "../data/nomic/family/y_hat.rds")
res_linear = read_rds("../data/nomic/family/res_linear.rds")
res_dist = read_rds("../data/nomic/family/res_dist.rds")
# Data organization
# Linear relationships
corr_linear = res_linear$corr_fl
cooccur_linear = res_linear$mat_cooccur
# Filter by co-occurrence
overlap = 10
corr_linear[cooccur_linear < overlap] = 0
df_linear = data.frame(get_upper_tri(corr_linear)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(var2 = gsub("\\...", " - ", var2),
value = round(value, 2),
metric = "Pearson")
# Distance relationships
corr_dist = res_dist$dcorr_fl
cooccur_dist = res_dist$mat_cooccur
# Filter by co-occurrence
overlap = 10
corr_dist[cooccur_dist < overlap] = 0
df_dist = data.frame(get_upper_tri(corr_dist)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(var2 = gsub("\\...", " - ", var2),
value = round(value, 2),
metric = "Distance")
# Merge datasets
df_corr = df_linear %>%
bind_rows(
df_dist
)
family_level = sort(union(df_corr$var1, df_corr$var2))
family_label = sapply(family_level, function(x) strsplit(x, " - ")[[1]][2])
txt_color = case_when(grepl("data1", family_level) ~ "#1B9E77",
grepl("data2", family_level) ~ "#D95F02",
TRUE ~ "#7570B3")
df_corr$var1 = factor(df_corr$var1, levels = family_level)
df_corr$var2 = factor(df_corr$var2, levels = family_level)
df_corr$metric = factor(df_corr$metric, levels = c("Pearson", "Distance"))
p_corr = df_corr %>%
ggplot(aes(var2, var1, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "grey",
midpoint = 0, limit = c(-1,1), space = "Lab",
name = NULL) +
scale_x_discrete(drop = FALSE, labels = family_label) +
scale_y_discrete(drop = FALSE, labels = family_label) +
facet_grid(rows = vars(metric)) +
geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = NULL) +
guides(fill = guide_colorbar(barwidth = 1, barheight = 7,
title.position = "right")) +
theme_bw() +
geom_vline(xintercept = c(8.5, 15.5), color = "blue", linetype = "dashed") +
geom_hline(yintercept = c(8.5, 15.5), color = "blue", linetype = "dashed") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1,
face = "italic", color = txt_color),
axis.text.y = element_text(size = 12, face = "italic", color = txt_color),
strip.text.x = element_text(size = 14),
strip.text.y = element_text(size = 14),
legend.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 15),
panel.grid.major = element_blank(),
axis.ticks = element_blank()) +
coord_fixed()
p_corr
ggsave(plot = p_corr, "../images/main/nomic_corr_family.pdf", height = 18, width = 16)
ggsave(plot = p_corr, "../images/main/nomic_corr_family.jpeg", height = 18, width = 16, dpi = 300)
y_hat = read_rds("../data/nomic/family/y_hat.rds")
df_abn = y_hat %>%
mutate_if(is.numeric, ~replace_na(., 0)) %>%
mutate(ave = rowMeans(across(where(is.numeric)))) %>%
select(family, day, ave) %>%
filter(family != "Others")
df_abn$family = factor(df_abn$family)
df_abn$day = factor(df_abn$day, levels = c("30", "120", "365"))
p_abn = df_abn %>%
ggplot(aes(x = day, y = ave, fill = family)) +
geom_bar(position = "dodge", stat = "identity") +
labs(x = "Day", y = "Abundance") +
scale_fill_brewer(name = NULL, palette = "Paired", drop = FALSE) +
scale_x_discrete(drop = FALSE) +
facet_wrap(.~family, nrow = 2, scales = "free") +
theme_bw()
p_abn
y_hat = y_hat %>%
mutate_if(is.numeric, ~replace_na(., 0)) %>%
mutate(total = rowSums(across(where(is.numeric))))
df_rel = y_hat %>%
filter(day == "30") %>%
mutate(prop = total/sum(total)) %>%
select(family, day, prop) %>%
bind_rows(
y_hat %>%
filter(day == "120") %>%
mutate(prop = total/sum(total)) %>%
select(family, day, prop)
) %>%
bind_rows(
y_hat %>%
filter(day == "365") %>%
mutate(prop = total/sum(total)) %>%
select(family, day, prop)
)
df_rel$family = factor(df_rel$family)
df_rel$family = forcats::fct_relevel(df_rel$family, "Others", after = Inf)
df_rel$day = factor(df_rel$day, levels = c("30", "120", "365"))
p_rel = df_rel %>%
ggplot(aes(x = day, y = prop, fill = family)) +
geom_bar(position = "stack", stat = "identity") +
labs(x = "Day", y = "Relative Abundance") +
scale_fill_brewer(name = NULL, palette = "Paired") +
theme_bw()
p_rel
df_rel = df_rel %>%
mutate(prop = round(prop, 2)) %>%
pivot_wider(names_from = "day", values_from = "prop") %>%
arrange(family)
datatable(df_rel)
# Combine abundance and relative abundance plots
p_abn = p_abn +
theme(legend.position = "none") +
labs(x = NULL)
p_combine = ggarrange(p_abn, p_rel, nrow = 2,
heights = c(1, 1.5), labels = c("a", "b"))
ggsave(plot = p_combine, "../images/supp/supp_nomic_count_family.pdf",
height = 8, width = 10)
ggsave(plot = p_combine, "../images/supp/supp_nomic_count_family.jpeg",
height = 8, width = 10, dpi = 300)
Enterobacteriaceae vs. Ruminococcaceae at day 120
df_eg = data.frame(x = res_linear$y_hat["data2 - Enterobacteriaceae", ],
y = res_linear$y_hat["data2 - Ruminococcaceae", ])
df_eg = df_eg[complete.cases(df_eg), ]
rlm1 = lmrob(y ~ x, data = df_eg, control = lmrob.control(max.it = 100))
rlm2 = lmrob(y ~ poly(x, 4), data = df_eg, control = lmrob.control(max.it = 100))
r_squared1 = round(summary(rlm1)$adj.r.squared, 2)
r_squared2 = round(summary(rlm2)$adj.r.squared, 2)
df_ann = data.frame(x = c(0.75, 0.75), y = c(1.75, 1.25)) %>%
mutate(label = c(paste0("R^{2}~(linear) == ", r_squared1),
paste0("R^{2}~(Poly) == ", r_squared2)))
scatter_eg = df_eg %>%
ggplot(aes(x = x, y = y)) +
geom_point(size = 4) +
geom_text(data = df_ann, aes(x = x, y = y, label = label), hjust = 0,
color = "orange", size = 4, fontface = "bold", parse = TRUE) +
labs(x = "Enterobacteriaceae", y = "Ruminococcaceae") +
geom_smooth(method = "lm", formula = y ~ poly(x, 4), se = FALSE, linetype = "dashed") +
theme_bw()
scatter_eg
ggsave(plot = scatter_eg, "../images/supp/supp_nonlinear_example2.pdf", height = 5, width = 6.25)
ggsave(plot = scatter_eg, "../images/supp/supp_nonlinear_example2.jpeg", height = 5, width = 6.25, dpi = 300)
raw_30 = abundances(family_data30)
rownames(raw_30) = paste0("data1 - ", rownames(raw_30))
raw_120 = abundances(family_data120)
rownames(raw_120) = paste0("data2 - ", rownames(raw_120))
raw_365 = abundances(family_data365)
rownames(raw_365) = paste0("data3 - ", rownames(raw_365))
raw_30_120 = data.frame(raw_30, check.names = FALSE) %>%
bind_rows(data.frame(raw_120, check.names = FALSE)) %>%
select_if(~ !any(is.na(.)))
raw_30_365 = data.frame(raw_30, check.names = FALSE) %>%
bind_rows(data.frame(raw_365, check.names = FALSE)) %>%
select_if(~ !any(is.na(.)))
raw_120_365 = data.frame(raw_120, check.names = FALSE) %>%
bind_rows(data.frame(raw_365, check.names = FALSE)) %>%
select_if(~ !any(is.na(.)))
# SparCC cannot handle missing values, so we have to calculate the correlation
# matrices separately
set.seed(123)
R_hat_sparcc11 = sparcc(t(raw_30))$Cor
R_hat_sparcc22 = sparcc(t(raw_120))$Cor
R_hat_sparcc33 = sparcc(t(raw_365))$Cor
R_hat_sparcc12 = sparcc(t(raw_30_120))$Cor
R_hat_sparcc12 = R_hat_sparcc12[seq_len(nrow(raw_30)),
seq(from = nrow(raw_30) + 1,
to = nrow(raw_30_120))]
R_hat_sparcc13 = sparcc(t(raw_30_365))$Cor
R_hat_sparcc13 = R_hat_sparcc13[seq_len(nrow(raw_30)),
seq(from = nrow(raw_30) + 1,
to = nrow(raw_30_365))]
R_hat_sparcc23 = sparcc(t(raw_120_365))$Cor
R_hat_sparcc23 = R_hat_sparcc23[seq_len(nrow(raw_120)),
seq(from = nrow(raw_120) + 1,
to = nrow(raw_120_365))]
R_hat_sparcc = matrix(NA,
nrow = nrow(raw_30) + nrow(raw_120) + nrow(raw_365),
ncol = nrow(raw_30) + nrow(raw_120) + nrow(raw_365))
R_hat_sparcc[seq_len(nrow(raw_30)), seq_len(nrow(raw_30))] = R_hat_sparcc11
R_hat_sparcc[seq_len(nrow(raw_30)),
seq(from = nrow(raw_30) + 1,
to = nrow(raw_30) + nrow(raw_120))] = R_hat_sparcc12
R_hat_sparcc[seq_len(nrow(raw_30)),
seq(from = nrow(raw_30) + nrow(raw_120) + 1,
to = nrow(raw_30) + nrow(raw_120) + nrow(raw_365))] = R_hat_sparcc13
R_hat_sparcc[seq(from = nrow(raw_30) + 1,
to = nrow(raw_30) + nrow(raw_120)),
seq(from = nrow(raw_30) + 1,
to = nrow(raw_30) + nrow(raw_120))] = R_hat_sparcc22
R_hat_sparcc[seq(from = nrow(raw_30) + 1,
to = nrow(raw_30) + nrow(raw_120)),
seq(from = nrow(raw_30) + nrow(raw_120) + 1,
to = nrow(raw_30) + nrow(raw_120) + nrow(raw_365))] = R_hat_sparcc23
R_hat_sparcc[seq(from = nrow(raw_30) + nrow(raw_120) + 1,
to = nrow(raw_30) + nrow(raw_120) + nrow(raw_365)),
seq(from = nrow(raw_30) + nrow(raw_120) + 1,
to = nrow(raw_30) + nrow(raw_120) + nrow(raw_365))] = R_hat_sparcc33
R_hat_sparcc[lower.tri(R_hat_sparcc)] = R_hat_sparcc[upper.tri(R_hat_sparcc)]
dimnames(R_hat_sparcc) = list(c(rownames(raw_30), rownames(raw_120), rownames(raw_365)),
c(rownames(raw_30), rownames(raw_120), rownames(raw_365)))
saveRDS(R_hat_sparcc, file = "../data/nomic/sparcc/R_hat_sparcc.rds")
R_hat_sparcc = read_rds("../data/nomic/sparcc/R_hat_sparcc.rds")
R_hat_sparcc = hard_thresh(R_hat_sparcc, 0.3)
col_ind = match(family_level, colnames(R_hat_sparcc))
R_hat_sparcc = R_hat_sparcc[col_ind[!is.na(col_ind)], col_ind[!is.na(col_ind)]]
df_sparcc = data.frame(get_upper_tri(R_hat_sparcc)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
mutate(var2 = gsub("\\...", " - ", var2)) %>%
filter(!is.na(value)) %>%
mutate(value = round(value, 2))
family_level = sort(union(df_sparcc$var1, df_sparcc$var2))
family_label = sapply(family_level, function(x) strsplit(x, " - ")[[1]][2])
txt_color = case_when(grepl("data1", family_level) ~ "#1B9E77",
grepl("data2", family_level) ~ "#D95F02",
TRUE ~ "#7570B3")
df_sparcc$var1 = factor(df_sparcc$var1, levels = family_level)
df_sparcc$var2 = factor(df_sparcc$var2, levels = family_level)
p_sparcc = df_sparcc %>%
ggplot(aes(var2, var1, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "grey",
midpoint = 0, limit = c(-1,1), space = "Lab",
name = NULL) +
scale_x_discrete(drop = FALSE, labels = family_label) +
scale_y_discrete(drop = FALSE, labels = family_label) +
geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = NULL) +
guides(fill = guide_colorbar(barwidth = 1, barheight = 7,
title.position = "right")) +
theme_bw() +
geom_vline(xintercept = c(8.5, 15.5), color = "blue", linetype = "dashed") +
geom_hline(yintercept = c(8.5, 15.5), color = "blue", linetype = "dashed") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1,
face = "italic", color = txt_color),
axis.text.y = element_text(size = 12, face = "italic", color = txt_color),
strip.text.x = element_text(size = 14),
strip.text.y = element_text(size = 14),
legend.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 15),
panel.grid.major = element_blank(),
axis.ticks = element_blank()) +
coord_fixed()
p_sparcc
ggsave(plot = p_sparcc, "../images/supp/supp_nomic_sparcc.pdf", height = 10, width = 13)
ggsave(plot = p_sparcc, "../images/supp/supp_nomic_sparcc.jpeg", height = 10, width = 13, dpi = 300)
sessionInfo()
R version 4.2.0 (2022-04-22)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Big Sur/Monterey 10.16
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] DT_0.23 robustbase_0.95-0 openxlsx_4.2.5
[4] SpiecEasi_1.1.2 ggpubr_0.4.0 randomcoloR_1.1.0.1
[7] ANCOMBC_1.6.2 microbiome_1.18.0 phyloseq_1.40.0
[10] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.9
[13] purrr_0.3.4 readr_2.1.2 tidyr_1.2.0
[16] tibble_3.1.7 ggplot2_3.3.6 tidyverse_1.3.1
loaded via a namespace (and not attached):
[1] readxl_1.4.0 backports_1.4.1 Hmisc_4.7-0
[4] VGAM_1.1-6 plyr_1.8.7 igraph_1.3.2
[7] splines_4.2.0 crosstalk_1.2.0 GenomeInfoDb_1.32.2
[10] digest_0.6.29 foreach_1.5.2 htmltools_0.5.2
[13] fansi_1.0.3 magrittr_2.0.3 checkmate_2.1.0
[16] cluster_2.1.3 doParallel_1.0.17 tzdb_0.3.0
[19] Biostrings_2.64.0 modelr_0.1.8 jpeg_0.1-9
[22] colorspace_2.0-3 rvest_1.0.2 haven_2.5.0
[25] rbibutils_2.2.8 xfun_0.31 crayon_1.5.1
[28] RCurl_1.98-1.7 jsonlite_1.8.0 Exact_3.1
[31] survival_3.3-1 iterators_1.0.14 ape_5.6-2
[34] glue_1.6.2 gtable_0.3.0 zlibbioc_1.42.0
[37] XVector_0.36.0 V8_4.2.0 car_3.1-0
[40] Rhdf5lib_1.18.2 shape_1.4.6 DEoptimR_1.0-11
[43] BiocGenerics_0.42.0 abind_1.4-5 scales_1.2.0
[46] mvtnorm_1.1-3 DBI_1.1.3 rngtools_1.5.2
[49] rstatix_0.7.0 Rcpp_1.0.8.3 htmlTable_2.4.0
[52] foreign_0.8-82 proxy_0.4-27 Formula_1.2-4
[55] stats4_4.2.0 glmnet_4.1-4 htmlwidgets_1.5.4
[58] httr_1.4.3 pulsar_0.3.7 RColorBrewer_1.1-3
[61] ellipsis_0.3.2 farver_2.1.0 pkgconfig_2.0.3
[64] nnet_7.3-17 sass_0.4.1 dbplyr_2.2.0
[67] utf8_1.2.2 labeling_0.4.2 tidyselect_1.1.2
[70] rlang_1.0.2 reshape2_1.4.4 munsell_0.5.0
[73] cellranger_1.1.0 tools_4.2.0 cli_3.3.0
[76] generics_0.1.2 ade4_1.7-19 broom_0.8.0
[79] evaluate_0.15 biomformat_1.24.0 fastmap_1.1.0
[82] yaml_2.3.5 knitr_1.39 fs_1.5.2
[85] zip_2.2.0 rootSolve_1.8.2.3 nlme_3.1-158
[88] doRNG_1.8.2 xml2_1.3.3 compiler_4.2.0
[91] rstudioapi_0.13 curl_4.3.2 png_0.1-7
[94] ggsignif_0.6.3 e1071_1.7-11 huge_1.3.5
[97] reprex_2.0.1 bslib_0.3.1 DescTools_0.99.45
[100] stringi_1.7.6 gsl_2.1-7.1 highr_0.9
[103] lattice_0.20-45 Matrix_1.4-1 nloptr_2.0.3
[106] vegan_2.6-2 permute_0.9-7 multtest_2.52.0
[109] vctrs_0.4.1 pillar_1.7.0 lifecycle_1.0.1
[112] rhdf5filters_1.8.0 Rdpack_2.3.1 jquerylib_0.1.4
[115] cowplot_1.1.1 data.table_1.14.2 bitops_1.0-7
[118] lmom_2.9 R6_2.5.1 latticeExtra_0.6-29
[121] gridExtra_2.3 IRanges_2.30.0 gld_2.6.4
[124] codetools_0.2-18 boot_1.3-28 energy_1.7-10
[127] MASS_7.3-57 assertthat_0.2.1 rhdf5_2.40.0
[130] withr_2.5.0 S4Vectors_0.34.0 GenomeInfoDbData_1.2.8
[133] mgcv_1.8-40 expm_0.999-6 parallel_4.2.0
[136] hms_1.1.1 grid_4.2.0 rpart_4.1.16
[139] class_7.3-20 rmarkdown_2.14 carData_3.0-5
[142] Rtsne_0.16 Biobase_2.56.0 lubridate_1.8.0
[145] base64enc_0.1-3